# -*- coding: utf-8 -*-

"""
Datetime: 2019/6/11
Author: Zhang Yafei
Description: 爬虫Spider
"""
import os
import re
from urllib.parse import urljoin

from engine import Request
from settings import TO_FILE
import pandas as pd


class XiaohuaSpider:
    """Spider that extracts image names and URLs from xiaohuar.com list pages.

    The methods are numbered in the order the steps are meant to happen:
    define start URLs, filter them, emit requests, parse responses and
    store data.
    """

    # 1. Start URLs: list pages 0 through 3.
    start_urls = [f'http://www.xiaohuar.com/list-1-{i}.html' for i in range(4)]

    # Base used to resolve image paths that are not already absolute URLs.
    _BASE_URL = 'http://www.xiaohuar.com'

    # Matches one thumbnail tag and captures (alt text, src).  ``[^>]*?``
    # keeps each gap inside a single tag and ``[^"]*`` keeps each capture
    # inside its own quotes, so several <img> tags on the same line are all
    # found instead of being swallowed by one greedy match.
    _IMG_PATTERN = re.compile(
        r'<img width="210"[^>]*?alt="([^"]*)"[^>]*?src="([^"]*)" />'
    )

    def filter_downloaded_urls(self) -> None:
        """2. Hook for filtering ``start_urls``; currently does nothing."""

    def start_request(self):
        """3. Yield one ``Request`` per start URL, with ``parse`` as callback."""
        for url in self.start_urls:
            yield Request(url=url, callback=self.parse)

    async def parse(self, response):
        """4. Parse a list-page response and yield one item per image found.

        Args:
            response: Object exposing an awaitable ``text(encoding=...)``
                method that returns the page HTML (decoded here as GBK).

        Yields:
            dict: ``{'img_url': <absolute image URL>, 'name': <alt text>}``.
        """
        html = await response.text(encoding='gbk')
        for name, src in self._IMG_PATTERN.findall(html):
            # Already-absolute URLs are passed through untouched; anything
            # else (site-relative or protocol-relative) is resolved against
            # the site root.
            img_url = src if src.startswith('http') else urljoin(self._BASE_URL, src)
            yield {'img_url': img_url, 'name': name}
            # NOTE: the image could instead be fetched by yielding
            # Request(url=img_url, callback=self.store_data, meta={'name': name}).

    @staticmethod
    async def store_data(response) -> None:
        """5. Data-storage callback; currently only prints the item's name.

        Args:
            response: Response whose ``request.meta`` mapping may carry a
                ``'name'`` entry.
        """
        print(response.request.meta.get('name'))

